import warnings
import sys, os
warnings.filterwarnings("ignore")

def load_data(dataset, dataset_folder="/data/shared/zhexu/"):
    """Load the raw text of *dataset* through its dataset-specific loader.

    The loader module is imported lazily, so only the module belonging to
    the requested dataset has to be importable.

    Args:
        dataset: One of 'cora', 'pubmed', 'ogbn-arxiv' or 'ogbn-products'.
        dataset_folder: Folder handed to the loader as ``dataset_folder``.

    Returns:
        Whatever the selected ``get_raw_text`` returns; its shape is defined
        by the ``raw_data_utils`` loaders, which are not part of this file.

    Raises:
        SystemExit: If *dataset* is not one of the supported names.
    """
    if dataset == 'cora':
        from raw_data_utils.load_cora import get_raw_text
    elif dataset == 'pubmed':
        from raw_data_utils.load_pubmed import get_raw_text
    elif dataset == 'ogbn-arxiv':
        from raw_data_utils.load_arxiv import get_raw_text
    elif dataset == 'ogbn-products':
        from raw_data_utils.load_products import get_raw_text
    else:
        # sys.exit rather than the bare exit(): the latter is injected by the
        # `site` module for interactive use and is not guaranteed to exist.
        sys.exit(f'Error: Dataset {dataset} not supported')

    return get_raw_text(dataset_folder=dataset_folder)

def _write_cleaned_lines(path, items, prefix=None):
    """Write each entry of *items* to *path* as one cleaned line.

    Cleaning replaces newlines with spaces, optionally replaces every
    occurrence of *prefix* with a space, replaces double spaces with single
    spaces in one pass, and strips leading/trailing whitespace.

    Args:
        path: Destination file; an existing file is overwritten.
        items: Iterable of strings to write.
        prefix: Marker text to blank out (e.g. 'Title: '), or None to skip.
    """
    # Explicit UTF-8 keeps the output independent of the platform locale.
    with open(path, 'w', encoding='utf-8') as fout:
        for item in items:
            line = item.replace('\n', ' ')
            if prefix is not None:
                line = line.replace(prefix, ' ')
            # Deliberately a single replace pass, matching the established
            # output format (runs of 3+ spaces are shortened, not collapsed).
            fout.write(line.replace('  ', ' ').strip())
            fout.write('\n')


def generate_title_content_label(output_folder, dataset, dataset_folder):
    """Export titles, contents and labels of *dataset* to text files.

    Writes ``{dataset}_title_list.txt``, ``{dataset}_content_list.txt`` and
    ``{dataset}_label_list.txt`` into ``output_folder/dataset`` (created if
    missing), one entry per line, then prints the three entry counts.

    Args:
        output_folder: Root folder for the exported files.
        dataset: Dataset name understood by ``load_data``.
        dataset_folder: Folder holding the raw data, passed to ``load_data``.
    """
    # load_data yields three values; only the middle one (the text mapping
    # with 'title', 'content' and 'label' entries) is used here.
    _, text, _ = load_data(dataset, dataset_folder)
    title = text['title']
    content = text['content']
    label = text['label']

    # os.path.join works whether or not output_folder ends with a separator.
    target_dir = os.path.join(output_folder, dataset)
    ensure_folder_exists(target_dir)

    _write_cleaned_lines(
        os.path.join(target_dir, f'{dataset}_title_list.txt'), title, 'Title: ')
    _write_cleaned_lines(
        os.path.join(target_dir, f'{dataset}_content_list.txt'), content, 'Abstract: ')
    _write_cleaned_lines(
        os.path.join(target_dir, f'{dataset}_label_list.txt'), label)

    print(len(title), len(content), len(label))

def ensure_folder_exists(folder_path):
    """Create *folder_path*, including missing parents, if it is absent.

    An already existing directory is left untouched and is not an error.
    """
    os.makedirs(name=folder_path, exist_ok=True)

# Script configuration: relative locations of the exported files and of the
# raw input data.
output_folder = "../processed_data/"
dataset_folder = "../raw_data/"

# Datasets to export; each one gets its own sub-folder under output_folder.
datasets = ['cora']
for dataset in datasets:
    print(dataset)
    generate_title_content_label(
        output_folder=output_folder,
        dataset=dataset,
        dataset_folder=dataset_folder,
    )